Installing Required Packages
# install required packages/library
install.packages("matlib")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("rsample")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("ggplot2")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("corrplot")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("MASS")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
## Warning: package 'MASS' is not available for this version of R
## 'MASS' version 7.3-65 is in the repositories but depends on R (>= 4.4.0)
## 'MASS' version 7.3-65 is in the repositories but depends on R (>= 4.6)
##
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
install.packages("gridExtra")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("dplyr")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("knitr")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("matrixStats")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
install.packages("ggdensity")
## Installing package into '/home/meutsabdahal/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
Loading Libraries
# import required packages/libraries
library(matlib) # for matrix operations
library(ggplot2) # for data visualization
library(rsample) # for data splitting
library(corrplot) # for correlation visualization
## corrplot 0.95 loaded
library(MASS) # for statistical functions
library(gridExtra) # for arranging plots
library(dplyr) # for data manipulation
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(knitr) # for tables
library(doParallel) # For parallel processing
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(foreach) # For parallel processing (used with doParallel)
library(ggdensity) # For enhanced 2D density plots/contours
Loading Datasets
# load features dataset (independent variable)
# NOTE(review): read.csv with header=FALSE assumes the CSVs carry no header
# row -- confirm against the raw files.
features <- as.matrix(read.csv(file="data/features.csv", header=FALSE))
colnames(features) <- c("x1", "x3", "x4", "x5")
# load target dataset (dependent variable)
target <- as.matrix(read.csv(file="data/target.csv", header=FALSE))
colnames(target) <- c("X2")
# load time series dataset
# NOTE(review): the name `time` masks base::time(); a name such as
# `timeseries` would avoid the shadowing, but renaming would touch later chunks.
time <- as.matrix((read.csv(file="data/timeseries.csv", header=FALSE)))
colnames(time) <- c("T1")
# display the first few rows of each dataset
cat("Features Dataset: \n")
## Features Dataset:
head(features)
## x1 x3 x4 x5
## [1,] 8.34 40.77 1010.84 90.01
## [2,] 23.64 58.49 1011.40 74.20
## [3,] 29.74 56.90 1007.15 41.91
## [4,] 19.07 49.69 1007.22 76.79
## [5,] 11.80 40.66 1017.13 97.20
## [6,] 13.97 39.16 1016.05 84.60
cat("\nTarget Dataset: \n")
##
## Target Dataset:
head(target)
## X2
## [1,] 480.48
## [2,] 445.75
## [3,] 438.76
## [4,] 453.09
## [5,] 464.43
## [6,] 470.96
# Display the head of the time-series dataset.
cat("\nTimeseries Dataset: \n")
##
## Timeseries Dataset:
# BUG FIX: this previously called head(features) again, so the transcript
# displayed the features columns a second time under the "Timeseries Dataset"
# heading instead of the time-series data.
head(time)
## (stale output removed -- re-knit to capture head(time); it should show the
## single T1 column loaded from data/timeseries.csv)
Task 2: Regression Modeling
Task 2.1: Model Parameters
# OLS helper: ordinary least-squares estimate theta_hat = (X'X)^-1 X'y.
#
# X : n x p design matrix (including any intercept column).
# y : length-n response vector or n x 1 matrix.
# Returns a p x 1 matrix of coefficient estimates.
#
# Uses MASS::ginv (Moore-Penrose pseudo-inverse), so a rank-deficient X'X
# still yields the minimum-norm solution instead of an error.
fit_ls <- function(X, y) {
  # FIX: rank check via QR is numerically reliable. The original compared
  # det(t(X) %*% X) == 0, which is meaningless in floating point: the
  # determinant of a numerically singular matrix is almost never exactly
  # zero, and det can overflow/underflow for large, well-conditioned X'X.
  if (qr(X)$rank < ncol(X)) {
    warning("Design matrix X is rank-deficient; OLS coefficients are not unique.",
            call. = FALSE)
  }
  ginv(t(X) %*% X) %*% t(X) %*% y
}
# Z-score each column of a numeric matrix (mean 0, sd 1).
# Constant columns make scale() produce NaN (sd == 0); those entries are
# replaced with 0 so downstream OLS still works.
# mat : numeric matrix (possibly with 0 columns), or NULL.
# Returns a scaled matrix of the same shape; NULL input yields an empty
# 0 x 0 matrix, and a 0-column input is returned as a 0-column matrix.
scale_features_matrix <- function(mat) {
  # BUG FIX: the original evaluated matrix(nrow = nrow(mat), ncol = 0) even
  # when mat was NULL, but nrow(NULL) is NULL, which makes matrix() error.
  if (is.null(mat)) {
    return(matrix(numeric(0), nrow = 0, ncol = 0))
  }
  if (ncol(mat) == 0) {
    return(matrix(nrow = nrow(mat), ncol = 0)) # empty matrix, row count kept
  }
  scaled_mat <- scale(mat)
  scaled_mat[is.na(scaled_mat)] <- 0 # NaN from constant columns -> 0
  return(scaled_mat)
}
# Build each model and fit
# Each candidate model: build the raw feature matrix, z-score it, prepend an
# intercept column of ones, then estimate theta by OLS via fit_ls().
# NOTE(review): these five blocks duplicate the design-matrix logic that
# Task 2.5 encodes in model_x_builders; reusing those builders here would
# remove the copy/paste. Left unchanged so the printed estimates below
# remain reproducible from this exact code.
theta_estimates <- list()
# Model 1: y = θ1·x4 + θ2·x3² + θbias
# Create raw feature matrix for this model
X1_raw_features <- cbind(x4 = features[,"x4"],
x3_squared = features[,"x3"]^2)
# Scale these raw features
X1_scaled_features <- scale_features_matrix(X1_raw_features)
# Add the intercept column to the scaled features
X1 <- cbind("(Intercept)" = 1, X1_scaled_features)
# Fit the OLS model
theta_estimates[["Model1"]] <- fit_ls(X1, target)
rownames(theta_estimates[["Model1"]]) <- colnames(X1)
# Model 2: y = θ1·x4 + θ2·x3² + θ3·x5 + θbias
X2_raw_features <- cbind(x4 = features[,"x4"],
x3_squared = features[,"x3"]^2,
x5 = features[,"x5"])
X2_scaled_features <- scale_features_matrix(X2_raw_features)
X2 <- cbind("(Intercept)" = 1, X2_scaled_features)
theta_estimates[["Model2"]] <- fit_ls(X2, target)
rownames(theta_estimates[["Model2"]]) <- colnames(X2)
# Model 3: y = θ1·x3 + θ2·x4 + θ3·x5³ + θbias
X3_raw_features <- cbind(x3 = features[,"x3"],
x4 = features[,"x4"],
x5_cubed = features[,"x5"]^3)
X3_scaled_features <- scale_features_matrix(X3_raw_features)
X3 <- cbind("(Intercept)" = 1, X3_scaled_features)
theta_estimates[["Model3"]] <- fit_ls(X3, target)
rownames(theta_estimates[["Model3"]]) <- colnames(X3)
# Model 4: y = θ1·x4 + θ2·x3² + θ3·x5³ + θbias
X4_raw_features <- cbind(x4 = features[,"x4"],
x3_squared = features[,"x3"]^2,
x5_cubed = features[,"x5"]^3)
X4_scaled_features <- scale_features_matrix(X4_raw_features)
X4 <- cbind("(Intercept)" = 1, X4_scaled_features)
theta_estimates[["Model4"]] <- fit_ls(X4, target)
rownames(theta_estimates[["Model4"]]) <- colnames(X4)
# Model 5: y = θ1·x4 + θ2·x1² + θ3·x3² + θbias
X5_raw_features <- cbind(x4 = features[,"x4"],
x1_squared = features[,"x1"]^2,
x3_squared = features[,"x3"]^2)
X5_scaled_features <- scale_features_matrix(X5_raw_features)
X5 <- cbind("(Intercept)" = 1, X5_scaled_features)
theta_estimates[["Model5"]] <- fit_ls(X5, target)
rownames(theta_estimates[["Model5"]]) <- colnames(X5)
# Print results
# The "X2" header in the printout below is just the column name inherited
# from the `target` matrix.
for (model in names(theta_estimates)) {
cat("\n", model, "θ̂ =\n")
print(theta_estimates[[model]])
}
##
## Model1 θ̂ =
## X2
## (Intercept) 454.365009
## x4 3.348278
## x3_squared -13.211452
##
## Model2 θ̂ =
## X2
## (Intercept) 454.365009
## x4 3.432657
## x3_squared -12.406622
## x5 2.517326
##
## Model3 θ̂ =
## X2
## (Intercept) 454.365009
## x3 -12.722943
## x4 3.402683
## x5_cubed 2.315840
##
## Model4 θ̂ =
## X2
## (Intercept) 454.365009
## x4 3.487220
## x3_squared -12.402038
## x5_cubed 2.487015
##
## Model5 θ̂ =
## X2
## (Intercept) 454.365009
## x4 1.349107
## x1_squared -10.605331
## x3_squared -5.173124
Task 2.3: Log-likelihood and Variance Calculation
# Constants for Log-Likelihood Calculation
n <- nrow(target) # Number of data samples
ln_2pi <- log(2 * pi) # Pre-calculate ln(2*pi)
# Compute Log-Likelihood and Variance for each model
log_likelihood_values <- list()
variance_estimates <- list() # List to store calculated variances (sigma_hat_squared)
# Iterate through the RSS values
# NOTE(review): rss_estimates is not defined in this excerpt -- presumably
# computed in a Task 2.2 chunk; verify it exists before this chunk runs.
for (model_name in names(rss_estimates)) {
current_rss <- rss_estimates[[model_name]]
# Only compute log-likelihood if RSS is a valid number
if (!is.na(current_rss) && is.numeric(current_rss)) {
# Calculate sigma_hat_squared
# NOTE(review): the (n - 1) divisor follows the coursework formula; the MLE
# would divide by n and the unbiased OLS estimator by (n - k) -- confirm.
sigma_hat_squared <- current_rss / (n - 1)
variance_estimates[[model_name]] <- sigma_hat_squared # Store the variance
# Check for non-positive or zero sigma_hat_squared before log
if (sigma_hat_squared <= 0) {
warning(paste("Model", model_name, ": sigma_hat_squared is non-positive or zero. Log-likelihood set to NA."))
log_likelihood_values[[model_name]] <- NA
} else {
# Compute the log-likelihood using the given formula
log_likelihood <- - (n / 2) * ln_2pi - (n / 2) * log(sigma_hat_squared) - (1 / (2 * sigma_hat_squared)) * current_rss
log_likelihood_values[[model_name]] <- log_likelihood
}
} else {
log_likelihood_values[[model_name]] <- NA # Set log-likelihood to NA if RSS was invalid
variance_estimates[[model_name]] <- NA # Set variance to NA if RSS was invalid
}
}
cat("\n Log-Likelihood and Variance: \n")
##
## Log-Likelihood and Variance:
for (model_name in names(log_likelihood_values)) {
cat("\n", model_name, "\n")
cat(" Variance (σ̂²):", variance_estimates[[model_name]], "\n")
cat(" Log-Likelihood:", log_likelihood_values[[model_name]], "\n")
}
##
## Model1
## Variance (σ̂²): 68.69951
## Log-Likelihood: -33810.99
##
## Model2
## Variance (σ̂²): 62.96091
## Log-Likelihood: -33393.69
##
## Model3
## Variance (σ̂²): 57.2271
## Log-Likelihood: -32936.88
##
## Model4
## Variance (σ̂²): 63.09509
## Log-Likelihood: -33403.88
##
## Model5
## Variance (σ̂²): 38.21731
## Log-Likelihood: -31005.4
Task 2.4: Compute AIC and BIC
# Constants for AIC/BIC Calculation
n <- nrow(target) # Number of data samples
# 'k' (number of estimated parameters) for each model
# NOTE(review): k counts every estimated coefficient including the bias
# (intercept); these values match the theta lengths fitted in Task 2.1.
k_values <- list(
Model1 = 3, # θ1·x4 + θ2·x3² + θbias
Model2 = 4, # θ1·x4 + θ2·x3² + θ3·x5 + θbias
Model3 = 4, # θ1·x3 + θ2·x4 + θ3·x5³ + θbias
Model4 = 4, # θ1·x4 + θ2·x3² + θ3·x5³ + θbias
Model5 = 4 # θ1·x4 + θ2·x1² + θ3·x3² + θbias
)
# Compute AIC and BIC for each model
aic_values <- list()
bic_values <- list()
# Iterate through the log-likelihood values
for (model_name in names(log_likelihood_values)) {
current_log_likelihood <- log_likelihood_values[[model_name]]
k_param <- k_values[[model_name]] # Get the 'k' value for the current model
# Only compute AIC/BIC if log-likelihood is a valid number and k is defined
if (!is.na(current_log_likelihood) && is.numeric(current_log_likelihood) &&
!is.null(k_param) && !is.na(k_param) && is.numeric(k_param)) {
# Compute AIC: AIC = 2k - 2 ln p(D|θ̂)
aic <- 2 * k_param - 2 * current_log_likelihood
aic_values[[model_name]] <- aic
# Compute BIC: BIC = k * ln(n) - 2 ln p(D|θ̂)
bic <- k_param * log(n) - 2 * current_log_likelihood
bic_values[[model_name]] <- bic
} else {
aic_values[[model_name]] <- NA # Set to NA if inputs were invalid
bic_values[[model_name]] <- NA # Set to NA if inputs were invalid
}
}
cat("\n Akaike Information Criterion (AIC) & Bayesian Information Criterion (BIC): \n")
##
## Akaike Information Criterion (AIC) & Bayesian Information Criterion (BIC):
for (model_name in names(aic_values)) {
cat("\n", model_name, "\n")
cat(" AIC:", aic_values[[model_name]], "\n")
cat(" BIC:", bic_values[[model_name]], "\n")
}
##
## Model1
## AIC: 67627.98
## BIC: 67649.48
##
## Model2
## AIC: 66795.38
## BIC: 66824.05
##
## Model3
## AIC: 65881.77
## BIC: 65910.43
##
## Model4
## AIC: 66815.75
## BIC: 66844.42
##
## Model5
## AIC: 62018.79
## BIC: 62047.46
Task 2.5: Distribution of Model Prediction Errors
# Model definitions to reconstruct X matrices for calculating residuals.
# Each builder takes (features, scale_func) and returns the scaled design
# matrix with a leading "(Intercept)" column of ones -- same contract as
# before. The per-model copy/paste of cbind/scale/intercept is factored
# into make_x_builder(): each model is now described only by its term list.
make_x_builder <- function(terms) {
  # terms: named list mapping design-matrix column name -> function(features)
  # that extracts/transforms the corresponding raw column.
  function(features, scale_func) {
    X_raw <- do.call(cbind, lapply(terms, function(f) f(features)))
    colnames(X_raw) <- names(terms)
    X_scaled <- scale_func(X_raw)
    cbind("(Intercept)" = 1, X_scaled)
  }
}
model_x_builders <- list(
  # Model 1: y = θ1·x4 + θ2·x3² + θbias
  Model1 = make_x_builder(list(
    x4         = function(f) f[, "x4"],
    x3_squared = function(f) f[, "x3"]^2
  )),
  # Model 2: y = θ1·x4 + θ2·x3² + θ3·x5 + θbias
  Model2 = make_x_builder(list(
    x4         = function(f) f[, "x4"],
    x3_squared = function(f) f[, "x3"]^2,
    x5         = function(f) f[, "x5"]
  )),
  # Model 3: y = θ1·x3 + θ2·x4 + θ3·x5³ + θbias
  Model3 = make_x_builder(list(
    x3       = function(f) f[, "x3"],
    x4       = function(f) f[, "x4"],
    x5_cubed = function(f) f[, "x5"]^3
  )),
  # Model 4: y = θ1·x4 + θ2·x3² + θ3·x5³ + θbias
  Model4 = make_x_builder(list(
    x4         = function(f) f[, "x4"],
    x3_squared = function(f) f[, "x3"]^2,
    x5_cubed   = function(f) f[, "x5"]^3
  )),
  # Model 5: y = θ1·x4 + θ2·x1² + θ3·x3² + θbias
  Model5 = make_x_builder(list(
    x4         = function(f) f[, "x4"],
    x1_squared = function(f) f[, "x1"]^2,
    x3_squared = function(f) f[, "x3"]^2
  ))
)
# For every fitted model: rebuild its design matrix, compute the prediction
# errors (residuals) against `target`, and draw a normal Q-Q plot so the
# Gaussianity of the errors can be assessed visually.
for (mdl in names(model_x_builders)) {
  # Rebuild the scaled design matrix for this model from the raw features.
  design <- model_x_builders[[mdl]](features, scale_features_matrix)

  # Fetch the OLS estimates computed earlier; skip the model if they are
  # absent, contain NAs, or do not match the design-matrix width.
  coef_hat <- theta_estimates[[mdl]]
  bad_theta <- is.null(coef_hat) || any(is.na(coef_hat)) ||
    length(coef_hat) != ncol(design)
  if (bad_theta) {
    warning(paste("Cannot compute residuals for", mdl, ": Invalid, missing, or dimension-mismatched theta estimates."))
    next
  }

  # Coerce a bare vector of coefficients into a column matrix for %*%.
  if (is.vector(coef_hat)) {
    coef_hat <- matrix(coef_hat, ncol = 1)
  }

  # Residuals = observed target minus model prediction, flattened for qqnorm.
  err <- as.vector(target - design %*% coef_hat)

  # Guard against NA/Inf residuals, which would break the plot.
  if (any(is.na(err)) || any(is.infinite(err))) {
    warning(paste("Residuals for", mdl, "contain NA/Inf values. Q-Q plot may not be generated properly."))
    next
  }

  qqnorm(err, main = paste("Q-Q Plot of Residuals for", mdl))
  qqline(err, col = "red")
}





Task 2.6: Selection of Best Regression Model
# Compile all model evaluation metrics into a comprehensive table
# Parameters counts every non-NA coefficient (intercept included), which
# matches the k_values used for AIC/BIC.
model_comparison <- data.frame(
Model = names(theta_estimates), # Using names from theta_estimates
Parameters = sapply(theta_estimates, function(theta) length(theta[!is.na(theta)])),
RSS = sapply(rss_estimates, function(rss) ifelse(is.null(rss) || length(rss) == 0 || is.na(rss), NA, rss)),
LogLikelihood = sapply(log_likelihood_values, function(val) ifelse(is.null(val) || is.na(val), NA, val)),
AIC = sapply(aic_values, function(val) ifelse(is.null(val) || is.na(val), NA, val)),
BIC = sapply(bic_values, function(val) ifelse(is.null(val) || is.na(val), NA, val)),
Variance = sapply(variance_estimates, function(val) ifelse(is.null(val) || is.na(val), NA, val))
)
# Display the comprehensive table
cat("\n Comprehensive Comparison of All Models \n")
##
## Comprehensive Comparison of All Models
print(kable(model_comparison,
caption = "Comprehensive Comparison of All Models",
digits = c(0, 0, 2, 2, 2, 2, 4)))
##
##
## Table: Comprehensive Comparison of All Models
##
## | |Model | Parameters| RSS| LogLikelihood| AIC| BIC| Variance|
## |:------|:------|----------:|--------:|-------------:|--------:|--------:|--------:|
## |Model1 |Model1 | 3| 657248.2| -33810.99| 67627.98| 67649.48| 68.6995|
## |Model2 |Model2 | 4| 602347.1| -33393.69| 66795.38| 66824.05| 62.9609|
## |Model3 |Model3 | 4| 547491.6| -32936.88| 65881.77| 65910.43| 57.2271|
## |Model4 |Model4 | 4| 603630.7| -33403.88| 66815.75| 66844.42| 63.0951|
## |Model5 |Model5 | 4| 365625.0| -31005.40| 62018.79| 62047.46| 38.2173|
cat("\n")
# Rank models based on different criteria
# Lower rank = better: RSS/AIC/BIC are minimized; log-likelihood is
# maximized, hence the negation before ranking.
rank_rss <- rank(model_comparison$RSS, na.last = "keep", ties.method = "min")
rank_aic <- rank(model_comparison$AIC, na.last = "keep", ties.method = "min")
rank_bic <- rank(model_comparison$BIC, na.last = "keep", ties.method = "min")
rank_loglik <- rank(-model_comparison$LogLikelihood, na.last = "keep", ties.method = "min") # Negative because higher is better
ranking_table <- data.frame(
Model = model_comparison$Model,
RSS_Rank = rank_rss,
AIC_Rank = rank_aic,
BIC_Rank = rank_bic,
LogLikelihood_Rank = rank_loglik
)
# Calculate Overall_Rank by summing individual ranks, considering only valid ranks
# NOTE(review): na.rm = TRUE means a model with an NA criterion contributes
# nothing for that column, which can unfairly lower its rank sum -- confirm
# this is intended if any metric can be NA.
ranking_table$Overall_Rank <- rowSums(ranking_table[, c("RSS_Rank", "AIC_Rank", "BIC_Rank", "LogLikelihood_Rank")], na.rm = TRUE)
# Sort by overall rank (lower sum is better)
ranking_table <- ranking_table[order(ranking_table$Overall_Rank),]
# Display the ranking table
cat("\n Model Ranking Based on Different Criteria (Lower Sum of Ranks is Better) \n")
##
## Model Ranking Based on Different Criteria (Lower Sum of Ranks is Better)
print(kable(ranking_table,
caption = "Model Ranking Based on Different Criteria (Lower is Better)"))
##
##
## Table: Model Ranking Based on Different Criteria (Lower is Better)
##
## | |Model | RSS_Rank| AIC_Rank| BIC_Rank| LogLikelihood_Rank| Overall_Rank|
## |:--|:------|--------:|--------:|--------:|------------------:|------------:|
## |5 |Model5 | 1| 1| 1| 1| 4|
## |3 |Model3 | 2| 2| 2| 2| 8|
## |2 |Model2 | 3| 3| 3| 3| 12|
## |4 |Model4 | 4| 4| 4| 4| 16|
## |1 |Model1 | 5| 5| 5| 5| 20|
cat("\n")
# Determine the best model based on individual criteria
# which.min/which.max skip NA entries and return the FIRST index on ties.
best_model_aic_index <- which.min(model_comparison$AIC)
best_model_aic_name <- model_comparison$Model[best_model_aic_index]
best_model_bic_index <- which.min(model_comparison$BIC)
best_model_bic_name <- model_comparison$Model[best_model_bic_index]
best_model_loglik_index <- which.max(model_comparison$LogLikelihood)
best_model_loglik_name <- model_comparison$Model[best_model_loglik_index]
best_model_rss_index <- which.min(model_comparison$RSS)
best_model_rss_name <- model_comparison$Model[best_model_rss_index]
cat(paste("Based on AIC, the best model is:", best_model_aic_name, "\n"))
## Based on AIC, the best model is: Model5
cat(paste("Based on BIC, the best model is:", best_model_bic_name, "\n"))
## Based on BIC, the best model is: Model5
cat(paste("Based on Log-Likelihood, the best model is:", best_model_loglik_name, "\n"))
## Based on Log-Likelihood, the best model is: Model5
cat(paste("Based on RSS, the best model is:", best_model_rss_name, "\n"))
## Based on RSS, the best model is: Model5
Task 2.7: Train-Test Split
set.seed(123) # Set seed for reproducibility of data splitting
# Split the input (features) and output (target) dataset
# NOTE(review): this chunk splits with base sample() while Task 3 uses
# rsample::initial_split() under the same seed -- the two partitions differ
# (6698 vs 6697 training rows in the transcript); unify if comparability
# between tasks matters.
sample_size <- nrow(features)
train_indices <- sample(sample_size, size = round(0.7 * sample_size))
test_indices <- setdiff(1:sample_size, train_indices)
# Create training and testing datasets
train_features <- features[train_indices, ]
# Indexing the single-column target matrix drops it to a plain vector;
# fit_ls and the arithmetic below handle vectors, so this is benign.
train_target <- target[train_indices, ]
test_features <- features[test_indices, ]
y_test <- target[test_indices, ]
cat(paste0("Data split: Training samples = ", nrow(train_features), ", Testing samples = ", nrow(test_features), "\n"))
## Data split: Training samples = 6698, Testing samples = 2870
# Use the best model and estimate its parameters using the training dataset
best_model_name <- best_model_aic_name
# Reconstruct the design matrix (X_train) for the best model using training features.
if (!exists("model_x_builders") || is.null(model_x_builders[[best_model_name]])) {
stop("model_x_builders or the builder for the best model is not available. Please ensure Task 2.5 code is run first.")
}
X_train_model <- model_x_builders[[best_model_name]](train_features, scale_features_matrix)
# Estimate model parameters use the training dataset
theta_train <- fit_ls(X_train_model, train_target)
rownames(theta_train) <- colnames(X_train_model)
cat(paste0("\nEstimated parameters for ", best_model_name, " using training data:\n"))
##
## Estimated parameters for Model5 using training data:
print(theta_train)
## [,1]
## (Intercept) 454.354339
## x4 1.280183
## x1_squared -10.646216
## x3_squared -5.149130
# Compute the model's output/prediction on the testing data
# NOTE(review): the test features are re-scaled with the TEST set's own
# means/sds rather than the training-set statistics -- a mild train/test
# mismatch; to be strict, reuse the training means/sds when scaling here.
X_test_model <- model_x_builders[[best_model_name]](test_features, scale_features_matrix)
# Compute predictions on the testing data
y_pred_test <- X_test_model %*% theta_train
# Calculate Test Error Metrics
test_rss <- sum((y_test - y_pred_test)^2)
test_rmse <- sqrt(mean((y_test - y_pred_test)^2))
test_mae <- mean(abs(y_test - y_pred_test))
test_r_squared <- 1 - sum((y_test - y_pred_test)^2) / sum((y_test - mean(y_test))^2)
# Display test metrics
test_metrics <- data.frame(
Metric = c("RSS", "RMSE", "MAE", "R-squared"),
Value = c(test_rss, test_rmse, test_mae, test_r_squared)
)
print(kable(test_metrics, caption = paste0("Model Performance on Test Data for ", best_model_name), format = "pipe", digits = 3))
##
##
## Table: Model Performance on Test Data for Model5
##
## |Metric | Value|
## |:---------|----------:|
## |RSS | 106713.984|
## |RMSE | 6.098|
## |MAE | 4.859|
## |R-squared | 0.873|
cat("\n")
# Compute 95% (model prediction) confidence intervals
# NOTE(review): the variance formula below includes the "+ 1" term, so these
# are PREDICTION intervals for new observations, not confidence intervals
# for the mean response -- the labelling in the tables/plot is slightly loose.
n_train <- nrow(X_train_model)
k_params <- ncol(X_train_model) # Number of estimated parameters including intercept
# Calculate residuals from the training fit to estimate sigma_hat_squared
residuals_train <- train_target - (X_train_model %*% theta_train)
rss_train <- sum(residuals_train^2)
# Estimate variance of residuals (sigma_hat_squared) from training data
sigma_hat_squared_train <- rss_train / (n_train - k_params)
sigma_hat_train <- sqrt(sigma_hat_squared_train)
# Calculate the inverse of (X_train_model'X_train_model)
XTX_inv_train <- tryCatch(
solve(t(X_train_model) %*% X_train_model),
error = function(e) {
warning("Matrix (X'X) is singular for training data. Using ginv from MASS for inverse.")
ginv(t(X_train_model) %*% X_train_model)
}
)
# Degrees of freedom for t-distribution
df <- n_train - k_params
if (df <= 0) {
warning("Degrees of freedom are zero or negative. Cannot compute confidence intervals.")
lower_ci <- rep(NA, nrow(y_pred_test))
upper_ci <- rep(NA, nrow(y_pred_test))
} else {
# T-value for 95% confidence (alpha = 0.05, two-tailed)
t_critical <- qt(0.975, df = df)
# Calculate prediction intervals for each test point
# NOTE(review): this per-row loop could be vectorized, e.g.
# lev <- rowSums((X_test_model %*% XTX_inv_train) * X_test_model),
# but the loop is correct and clear, so it is left unchanged.
lower_ci <- numeric(nrow(X_test_model))
upper_ci <- numeric(nrow(X_test_model))
for (i in 1:nrow(X_test_model)) {
x0_t <- matrix(X_test_model[i, ], nrow = 1) # Transpose to row vector
# Variance of the prediction for a new observation
pred_variance <- sigma_hat_squared_train * (1 + x0_t %*% XTX_inv_train %*% t(x0_t))
pred_sd <- sqrt(pred_variance)
margin_of_error <- t_critical * pred_sd
lower_ci[i] <- y_pred_test[i] - margin_of_error
upper_ci[i] <- y_pred_test[i] + margin_of_error
}
}
# Convert to vectors for plotting consistency in ggplot
y_test_vec <- as.vector(y_test)
y_pred_test_vec <- as.vector(y_pred_test)
# Prepare data for ggplot
plot_data <- data.frame(
index = seq_along(y_test_vec), # Use seq_along on the vector
Actual = y_test_vec, # Use the vector directly
Predicted = y_pred_test_vec, # Use the vector directly
LowerCI = lower_ci,
UpperCI = upper_ci
)
# Define how many rows to display
num_display_rows <- min(10, nrow(plot_data)) # Display up to 10 rows
# Select the sample rows and display them using kable for nice formatting
sample_display_df <- plot_data[1:num_display_rows, c("Actual", "Predicted", "LowerCI", "UpperCI")]
print(kable(sample_display_df,
caption = paste0("Sample of ", num_display_rows, " Test Predictions with 95% Confidence Intervals for ", best_model_name),
format = "pipe",
digits = 3)) # Adjust digits for desired precision
##
##
## Table: Sample of 10 Test Predictions with 95% Confidence Intervals for Model5
##
## | Actual| Predicted| LowerCI| UpperCI|
## |------:|---------:|-------:|-------:|
## | 470.96| 469.773| 457.578| 481.967|
## | 451.41| 454.666| 442.470| 466.862|
## | 473.74| 473.549| 461.355| 485.744|
## | 441.76| 440.817| 428.622| 453.011|
## | 474.71| 476.158| 463.961| 488.354|
## | 487.69| 478.450| 466.253| 490.647|
## | 452.16| 451.566| 439.373| 463.760|
## | 483.26| 476.192| 463.997| 488.387|
## | 433.59| 430.476| 418.278| 442.675|
## | 435.14| 443.392| 431.194| 455.591|
cat("\n")
# Generate ggplot: actual vs. predicted values on the test set, with the
# 95% band shaded.
# FIX: the `size` aesthetic for lines was deprecated in ggplot2 3.4.0 (the
# knit transcript showed the deprecation warning); `linewidth` is the
# replacement for geom_line. `size` remains correct for geom_point.
ggplot(plot_data, aes(x = index)) +
geom_ribbon(aes(ymin = LowerCI, ymax = UpperCI, fill = "95% CI"), alpha = 0.2, na.rm = TRUE) + # Shaded CI
geom_line(aes(y = Predicted, color = "Predicted"), linetype = "dashed", linewidth = 0.8, na.rm = TRUE) + # Predicted line
geom_point(aes(y = Actual, color = "Actual"), size = 1.0, alpha = 0.7, na.rm = TRUE) + # Actual points
scale_color_manual(name = "colour",
values = c("Actual" = "blue", "Predicted" = "red")) +
scale_fill_manual(name = "fill", values = c("95% CI" = "grey")) +
labs(
title = paste0(best_model_name, " : Actual vs. Predicted with 95% CI (Test Set)"), # Match title style
x = "index", # Match x-axis label style
y = "Net hourly output" # Match y-axis label style
) +
theme_minimal() + # Use a minimal theme
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10),
legend.title = element_text(size = 10),
legend.text = element_text(size = 9)
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Task 3: Approximate Bayesian Computation (ABC)
Posterior Distribution and Parameter Estimation
# Specify Model 5 as the best model
best_model_name_abc <- "Model5"
cat(paste0("Using '", best_model_name_abc, "' as the best model for ABC.\n"))
## Using 'Model5' as the best model for ABC.
# Ensure the model_x_builder for Model 5 is available
if (!exists("model_x_builders") || is.null(model_x_builders[[best_model_name_abc]])) {
stop("model_x_builders or the builder for Model5 is not available. Please ensure Task 2.5 code is run first.")
}
# Ensures that the ABC is based on a training dataset
# NOTE(review): although set.seed(123) matches Task 2.7, rsample::initial_split
# consumes the RNG differently from base sample(), so this 70/30 split is NOT
# the same partition as Task 2.7's (6697 vs 6698 training rows).
set.seed(123) # for reproducibility of data split
data_full <- as.data.frame(cbind(features, target))
data_split <- initial_split(data_full, prop = 0.70) # 70% for training
features_train <- as.matrix(training(data_split)[, colnames(features)])
target_train <- as.matrix(training(data_split)[, colnames(target)])
features_test <- as.matrix(testing(data_split)[, colnames(features)])
target_test <- as.matrix(testing(data_split)[, colnames(target)])
cat(paste0("\nData split into training (", nrow(features_train), " samples) and testing (", nrow(features_test), " samples).\n"))
##
## Data split into training (6697 samples) and testing (2871 samples).
# Construct the design matrix for Model 5 using the TRAINING dataset
X_train_full_design_matrix <- model_x_builders[[best_model_name_abc]](features_train, scale_features_matrix)
# Check for valid column names in X_train_full_design_matrix
if (is.null(colnames(X_train_full_design_matrix)) || any(is.na(colnames(X_train_full_design_matrix))) || any(colnames(X_train_full_design_matrix) == "")) {
stop("The design matrix (X_train_full_design_matrix) for Model5 contains invalid column names (NULL, NA, or empty string). Please ensure valid, non-empty column names are generated by model_x_builders.")
}
# Estimate parameters for Model 5 using the TRAINING dataset
theta_estimate_model_5_train <- fit_ls(X_train_full_design_matrix, target_train)
rownames(theta_estimate_model_5_train) <- colnames(X_train_full_design_matrix)
cat(paste0("\nLeast squares estimates for ", best_model_name_abc, " (training dataset):\n"))
##
## Least squares estimates for Model5 (training dataset):
print(theta_estimate_model_5_train)
## [,1]
## (Intercept) 454.356682
## x4 1.279909
## x1_squared -10.646107
## x3_squared -5.149565
# Convert to a data frame for easier manipulation and sorting
theta_df <- data.frame(
Parameter = rownames(theta_estimate_model_5_train),
Estimate = as.vector(theta_estimate_model_5_train)
)
theta_df$AbsEstimate <- abs(theta_df$Estimate)
theta_df_ordered <- theta_df[order(-theta_df$AbsEstimate), ]
# Select the top 2 parameters with largest absolute values to vary
# NOTE(review): with z-scored features the intercept dominates in magnitude,
# so the intercept plus x1_squared get varied (see transcript below) --
# confirm that varying the intercept rather than two slopes is intended.
params_to_vary_info <- theta_df_ordered[1:2, ]
params_to_vary_names <- params_to_vary_info$Parameter
# Explicitly filter out any invalid parameter names
# NOTE(review): is.null() applied elementwise to an atomic vector is always
# FALSE, so the !is.null(...) clause below is a no-op (harmless).
params_to_vary_names <- params_to_vary_names[!is.na(params_to_vary_names) & params_to_vary_names != "" & !is.null(params_to_vary_names)]
# Re-check the length after filtering to ensure at least 2 valid names remain
if (length(params_to_vary_names) < 2) {
stop("After validation and filtering, fewer than 2 valid parameter names remain for ABC. Please check the model definition and data for issues with parameter names or coefficients.")
}
# Ensure selected parameter names exist in the theta estimate's row names
if (!all(params_to_vary_names %in% rownames(theta_estimate_model_5_train))) {
stop(paste0("Error: Selected parameters to vary (", paste(params_to_vary_names, collapse = ", "), ") do not exist as row names in the training model's theta estimate. Available names: ", paste(rownames(theta_estimate_model_5_train), collapse = ", "), ". This might indicate an issue with feature matrix column naming or theta estimation."))
}
# The remaining parameters are fixed to their estimated values
if (nrow(theta_df_ordered) > 2) {
fixed_params_info <- theta_df_ordered[3:nrow(theta_df_ordered), ]
fixed_params_values <- setNames(fixed_params_info$Estimate, fixed_params_info$Parameter)
} else {
fixed_params_info <- data.frame(Parameter = character(0), Estimate = numeric(0), AbsEstimate = numeric(0))
fixed_params_values <- setNames(numeric(0), character(0))
}
cat(paste0("\nParameters to vary in ABC:\n"))
##
## Parameters to vary in ABC:
print(params_to_vary_info)
## Parameter Estimate AbsEstimate
## 1 (Intercept) 454.35668 454.35668
## 3 x1_squared -10.64611 10.64611
cat(paste0("\nFixed parameters (and their estimated values):\n"))
##
## Fixed parameters (and their estimated values):
print(fixed_params_values)
## x3_squared x4
## -5.149565 1.279909
# Retrieve the LS estimates for the two parameters to be varied for prior centering
prior_mean_param1 <- theta_estimate_model_5_train[params_to_vary_names[1], 1]
prior_mean_param2 <- theta_estimate_model_5_train[params_to_vary_names[2], 1]
# Define uniform prior ranges: centered around LS estimate, with a minimum width
# NOTE(review): +/- 5x the absolute estimate is very wide for the intercept
# (transcript: [-1817, 2726]); a narrower prior would raise the ABC
# acceptance rate considerably.
prior_range_half_param1 <- max(5 * abs(prior_mean_param1), 0.5) # Ensures exploration even if LS estimate is small
prior_range_half_param2 <- max(5 * abs(prior_mean_param2), 0.5)
prior_param1_min <- prior_mean_param1 - prior_range_half_param1
prior_param1_max <- prior_mean_param1 + prior_range_half_param1
prior_param2_min <- prior_mean_param2 - prior_range_half_param2
prior_param2_max <- prior_mean_param2 + prior_range_half_param2
priors_abc <- list(
param1 = list(name = params_to_vary_names[1], type = "uniform", min = prior_param1_min, max = prior_param1_max),
param2 = list(name = params_to_vary_names[2], type = "uniform", min = prior_param2_min, max = prior_param2_max)
)
# Display the prior parameter names and their uniform ranges
cat("\nPrior parameters (Uniform Distribution Ranges):\n")
##
## Prior parameters (Uniform Distribution Ranges):
cat(paste0("- ", priors_abc$param1$name, ": [", round(priors_abc$param1$min, 4), ", ", round(priors_abc$param1$max, 4), "]\n"))
## - (Intercept): [-1817.4267, 2726.1401]
cat(paste0("- ", priors_abc$param2$name, ": [", round(priors_abc$param2$min, 4), ", ", round(priors_abc$param2$max, 4), "]\n"))
## - x1_squared: [-63.8766, 42.5844]
# Draw one candidate parameter pair from the uniform priors in the global
# `priors_abc` list. Returns a named numeric vector with one draw per prior,
# in the order (param1, param2) so RNG consumption matches the original.
sample_from_priors <- function() {
  draws <- vapply(
    list(priors_abc$param1, priors_abc$param2),
    function(prior) runif(1, prior$min, prior$max),
    numeric(1)
  )
  names(draws) <- c(priors_abc$param1$name, priors_abc$param2$name)
  draws
}
# Calculate the observed summary statistic (RSS) for the OLS model on the TRAINING data.
# This observed RSS is the reference value that each simulated parameter
# draw is compared against in the rejection-ABC loop below.
y_pred_obs_train_model <- X_train_full_design_matrix %*% theta_estimate_model_5_train
residuals_obs_train_model <- target_train - y_pred_obs_train_model
observed_rss_train_model <- sum(residuals_obs_train_model^2)
cat(paste0("\nObserved RSS of ", best_model_name_abc, " on training dataset: ", observed_rss_train_model, "\n"))
##
## Observed RSS of Model5 on training dataset: 258937.811114784
Implement Rejection ABC Algorithm
# Summary statistic for one ABC draw: the residual sum of squares (RSS) of the
# linear predictor X_matrix %*% simulated_theta_vector against observed_y.
#
# simulated_theta_vector: coefficient vector for this simulation
# X_matrix:               design matrix (rows = observations)
# observed_y:             observed response values
# Returns a single non-negative number.
calculate_simulated_rss <- function(simulated_theta_vector, X_matrix, observed_y) {
  fitted_values <- X_matrix %*% simulated_theta_vector
  sum((observed_y - fitted_values)^2)
}
cat(paste0("Summary statistic chosen: Residual Sum of Squares (RSS).\n"))
## Summary statistic chosen: Residual Sum of Squares (RSS).
cat(paste0("The observed RSS calculated on training data is: ", observed_rss_train_model, "\n"))
## The observed RSS calculated on training data is: 258937.811114784
# Define ABC parameters
n_abc_simulations <- 5000000 # Number of simulations to run for ABC
# Acceptance tolerance: a draw is kept when its simulated RSS is below 1.5x the
# observed RSS. The 1.5 multiplier is a tuning choice — smaller values sharpen
# the posterior but lower the acceptance rate.
epsilon <- 1.5 * observed_rss_train_model
cat(paste0("Epsilon (tolerance for acceptance): ", epsilon, "\n"))
## Epsilon (tolerance for acceptance): 388406.716672176
# Initialize list to collect accepted samples
accepted_samples_list <- list()
set.seed(42) # Set seed for reproducibility of random sampling within ABC
# Using a for loop with explicit acceptance based on epsilon
cat(paste0("\nStarting ABC simulations (", n_abc_simulations, " iterations).\n"))
##
## Starting ABC simulations (5e+06 iterations).
pb <- txtProgressBar(min = 0, max = n_abc_simulations, style = 3) # Progress bar
## | | | 0%
# Define all parameter names, including fixed ones, for constructing current_theta_abc
# (not referenced directly inside the loop below — TODO confirm later use).
all_param_names <- rownames(theta_estimate_model_5_train)
# Rejection-ABC loop: draw the two varying coefficients from their priors, keep
# every other coefficient at its training least-squares estimate, and accept
# the draw when the resulting training RSS falls below epsilon.
# NOTE(review): results are reproducible only if the RNG call order inside
# sample_from_priors() and the set.seed(42) above stay unchanged.
for (i in 1:n_abc_simulations) {
# Sample parameters from the prior distribution
proposed_theta_varying <- sample_from_priors()
# Create a full parameter vector for the current simulation
current_theta_abc <- theta_estimate_model_5_train # Start with training LS estimates
# Overwrite only the two varying coefficients with the prior draws
current_theta_abc[params_to_vary_names[1], 1] <- proposed_theta_varying[params_to_vary_names[1]]
current_theta_abc[params_to_vary_names[2], 1] <- proposed_theta_varying[params_to_vary_names[2]]
# Calculate the summary statistic for the simulated data using TRAINING data
simulated_rss <- calculate_simulated_rss(
simulated_theta_vector = current_theta_abc,
X_matrix = X_train_full_design_matrix, # Use training design matrix
observed_y = target_train # Use training target
)
# Rejection step: accept if simulated RSS is less than epsilon
if (simulated_rss < epsilon) {
# Store the accepted sample (the two varying parameters and their RSS).
# Appending to the list is cheap in practice because acceptances are rare
# (~0.01% of draws in the recorded run).
temp_data_list <- list()
temp_data_list[[params_to_vary_names[1]]] <- proposed_theta_varying[params_to_vary_names[1]]
temp_data_list[[params_to_vary_names[2]]] <- proposed_theta_varying[params_to_vary_names[2]]
temp_data_list[["rss"]] <- simulated_rss
accepted_samples_list[[length(accepted_samples_list) + 1]] <- as.data.frame(temp_data_list)
}
setTxtProgressBar(pb, i) # Update progress bar
}
## | | | 1% | |= | 1% | |= | 2% | |== | 2% | |== | 3% | |== | 4% | |=== | 4% | |=== | 5% | |==== | 5% | |==== | 6% | |===== | 6% | |===== | 7% | |===== | 8% | |====== | 8% | |====== | 9% | |======= | 9% | |======= | 10% | |======= | 11% | |======== | 11% | |======== | 12% | |========= | 12% | |========= | 13% | |========= | 14% | |========== | 14% | |========== | 15% | |=========== | 15% | |=========== | 16% | |============ | 16% | |============ | 17% | |============ | 18% | |============= | 18% | |============= | 19% | |============== | 19% | |============== | 20% | |============== | 21% | |=============== | 21% | |=============== | 22% | |================ | 22% | |================ | 23% | |================ | 24% | |================= | 24% | |================= | 25% | |================== | 25% | |================== | 26% | |=================== | 26% | |=================== | 27% | |=================== | 28% | |==================== | 28% | |==================== | 29% | |===================== | 29% | |===================== | 30% | |===================== | 31% | |====================== | 31% | |====================== | 32% | |======================= | 32% | |======================= | 33% | |======================= | 34% | |======================== | 34% | |======================== | 35% | |========================= | 35% | |========================= | 36% | |========================== | 36% | |========================== | 37% | |========================== | 38% | |=========================== | 38% | |=========================== | 39% | |============================ | 39% | |============================ | 40% | |============================ | 41% | |============================= | 41% | |============================= | 42% | |============================== | 42% | |============================== | 43% | |============================== | 44% | |=============================== | 44% | |=============================== | 45% | |================================ | 45% | 
|================================ | 46% | |================================= | 46% | |================================= | 47% | |================================= | 48% | |================================== | 48% | |================================== | 49% | |=================================== | 49% | |=================================== | 50% | |=================================== | 51% | |==================================== | 51% | |==================================== | 52% | |===================================== | 52% | |===================================== | 53% | |===================================== | 54% | |====================================== | 54% | |====================================== | 55% | |======================================= | 55% | |======================================= | 56% | |======================================== | 56% | |======================================== | 57% | |======================================== | 58% | |========================================= | 58% | |========================================= | 59% | |========================================== | 59% | |========================================== | 60% | |========================================== | 61% | |=========================================== | 61% | |=========================================== | 62% | |============================================ | 62% | |============================================ | 63% | |============================================ | 64% | |============================================= | 64% | |============================================= | 65% | |============================================== | 65% | |============================================== | 66% | |=============================================== | 66% | |=============================================== | 67% | |=============================================== | 68% | |================================================ | 68% | |================================================ | 
69% | |================================================= | 69% | |================================================= | 70% | |================================================= | 71% | |================================================== | 71% | |================================================== | 72% | |=================================================== | 72% | |=================================================== | 73% | |=================================================== | 74% | |==================================================== | 74% | |==================================================== | 75% | |===================================================== | 75% | |===================================================== | 76% | |====================================================== | 76% | |====================================================== | 77% | |====================================================== | 78% | |======================================================= | 78% | |======================================================= | 79% | |======================================================== | 79% | |======================================================== | 80% | |======================================================== | 81% | |========================================================= | 81% | |========================================================= | 82% | |========================================================== | 82% | |========================================================== | 83% | |========================================================== | 84% | |=========================================================== | 84% | |=========================================================== | 85% | |============================================================ | 85% | |============================================================ | 86% | |============================================================= | 86% | 
|============================================================= | 87% | |============================================================= | 88% | |============================================================== | 88% | |============================================================== | 89% | |=============================================================== | 89% | |=============================================================== | 90% | |=============================================================== | 91% | |================================================================ | 91% | |================================================================ | 92% | |================================================================= | 92% | |================================================================= | 93% | |================================================================= | 94% | |================================================================== | 94% | |================================================================== | 95% | |=================================================================== | 95% | |=================================================================== | 96% | |==================================================================== | 96% | |==================================================================== | 97% | |==================================================================== | 98% | |===================================================================== | 98% | |===================================================================== | 99% | |======================================================================| 99% | |======================================================================| 100%
close(pb) # done with the progress bar
# Stack the accepted one-row data frames into a single data frame
# (do.call(rbind, ...) returns NULL when nothing was accepted).
accepted_samples_df <- do.call(rbind, accepted_samples_list)
n_accepted <- if (is.null(accepted_samples_df)) 0 else nrow(accepted_samples_df)
if (n_accepted == 0) {
  cat("\nNo samples were accepted. Try increasing n_abc_simulations, widening priors, or increasing epsilon.\n")
  # Supply an empty but correctly named data frame so downstream plotting
  # code keeps working.
  posterior_samples <- data.frame(matrix(ncol = 2, nrow = 0))
  colnames(posterior_samples) <- params_to_vary_names
} else {
  cat(paste("\nNumber of accepted samples:", n_accepted, "out of", n_abc_simulations, "\n"))
  cat(paste("Acceptance rate:", round(n_accepted / n_abc_simulations * 100, 2), "%\n"))
  # Columns 1-2 hold the two varying parameters (column 3 is the RSS). They are
  # selected by position and renamed because as.data.frame() may have mangled
  # non-syntactic names such as "(Intercept)" when the rows were built.
  posterior_samples <- accepted_samples_df[, 1:2]
  colnames(posterior_samples) <- params_to_vary_names
}
##
## Number of accepted samples: 664 out of 5e+06
## Acceptance rate: 0.01 %
Marginal and Joint Posterior Distribution
# Plot the marginal and joint posterior distributions of the two varying
# parameters, provided ABC accepted at least one sample.
if (is.null(posterior_samples) || nrow(posterior_samples) == 0) {
  cat("\nCannot plot posterior distributions: No samples were accepted in ABC.\n")
} else {
  # ggplot expects a plain data frame
  posterior_df <- as.data.frame(posterior_samples)
  # Shared styling: minimal theme with a bold, centred title.
  centred_title <- theme(plot.title = element_text(hjust = 0.5, face = "bold"))
  # Build one marginal plot (histogram + kernel density) for a parameter.
  marginal_plot <- function(param, fill_colour, line_colour) {
    ggplot(posterior_df, aes(x = .data[[param]])) +
      geom_histogram(aes(y = after_stat(density)), bins = 30, fill = fill_colour, color = "black") +
      geom_density(color = line_colour, linewidth = 1) +
      labs(
        title = paste("Marginal Posterior for Parameter:", param),
        x = paste("Value of", param),
        y = "Density"
      ) +
      theme_minimal() +
      centred_title
  }
  # Marginal posterior of the first varying parameter
  p_marginal1 <- marginal_plot(params_to_vary_names[1], "lightblue", "darkblue")
  print(p_marginal1)
  # Marginal posterior of the second varying parameter
  p_marginal2 <- marginal_plot(params_to_vary_names[2], "lightgreen", "darkgreen")
  print(p_marginal2)
  # Joint posterior: scatter of accepted pairs overlaid with 2-D density contours
  p_joint <- ggplot(posterior_df, aes(x = .data[[params_to_vary_names[1]]], y = .data[[params_to_vary_names[2]]])) +
    geom_point(alpha = 0.2, color = "purple") +
    geom_density_2d(color = "darkblue", linewidth = 0.8) +
    labs(
      title = paste("Joint Posterior Distribution of", params_to_vary_names[1], "and", params_to_vary_names[2]),
      x = paste("Value of", params_to_vary_names[1]),
      y = paste("Value of", params_to_vary_names[2])
    ) +
    theme_minimal() +
    centred_title
  print(p_joint)
}


